`timescale 1ns / 1ps

// calc_core: simulation model of one per-pixel execution lane.
//
// On a pulse of `init` the core decides whether pixel (xpos, ypos) is covered
// by the triangle (x1,y1)-(x2,y2)-(x3,y3) and sets `mask` accordingly.  While
// `mask` is set, the 64-bit word on `ins` is executed on every rising `clk`
// edge and traced with $display.
//
// Ports
//   rst        active-low asynchronous reset; clears `mask`
//   clk        instruction clock
//   ins        instruction word, sampled on posedge clk
//   xpos,ypos  pixel position of this core (treated as unsigned)
//   x1..y3     triangle vertices (treated as signed 16-bit values)
//   init       rising edge latches the edge functions, falling edge updates mask
//   mask       1 while this core is enabled
//
// Instruction layout (LSB first):
//   [7:0]   opcode
//   [13:8]  destination register
//   [14]    saturate result to [0, 1.0]      (ALU ops)
//   [15]    condition flag (not interpreted by any opcode below)
//   [31:16] source 1, [47:32] source 2, [63:48] source 3 (no opcode below reads source 3)
//           each source = {neg(1), abs(1), order(8), index(6)}
//   OP_DEF reuses [15:14] as the destination lane and [47:16] as the immediate.
//
// This block is not synthesizable as written ($display, event controls inside
// always blocks, `mask` written from two processes).
module calc_core(
    input wire rst,
    input wire clk,
    input wire [63:0] ins,
    input wire [15:0] xpos, ypos,
    input wire [15:0] x1, x2, x3, y1, y2, y3,
    input wire init,
    output reg mask
);

// Register lanes hold sint32 values in 16.16 fixed point
// (16 integer bits, 16 fraction bits).

localparam [7:0] OP_NOP = 8'h00,
                 OP_ADD = 8'h01,
                 OP_SUB = 8'h02,
                 OP_MUL = 8'h03,
                 OP_DIV = 8'h04,
                 OP_DEF = 8'h05;

// 1.0 in 16.16 fixed point; upper saturation bound.
localparam signed [31:0] FIX_ONE = 32'sh0001_0000;

// Clamp a signed 16.16 value to the range [0, 1.0].
// The literals are signed so that the comparisons are signed comparisons.
function [31:0] saturate_sint32;
    input signed [31:0] x;
    begin
        if (x < 32'sd0)
            saturate_sint32 = 32'd0;
        else if (x > FIX_ONE)
            saturate_sint32 = FIX_ONE;
        else
            saturate_sint32 = x;
    end
endfunction

// Absolute value of a signed 32-bit value.
// The most negative value has no positive counterpart and is returned unchanged.
function [31:0] abs_sint32;
    input signed [31:0] x;
    begin
        abs_sint32 = (x < 32'sd0) ? -x : x;
    end
endfunction

reg [63:0] exec;                 // instruction currently being executed
reg [3:0][31:0] regfile [63:0];  // 64 registers, each 4 lanes (x,y,z,w) of 32 bits

reg condi;                       // cleared when the core is enabled; not read in this module

// Per-instruction scratch values, kept at module scope so they show up in waveforms.
reg [3:0][31:0] src1, src2;      // operands after swizzle / abs / neg
reg [3:0][31:0] res_ori;         // raw ALU result
reg [3:0][31:0] res;             // result after optional saturation
integer lane;

// Read one source operand.
//   sel = {neg, abs, order[7:0], index[5:0]}
// Output lane k takes register lane order[2k+1:2k], then applies abs, then neg.
// Evaluated procedurally so the operand always reflects the instruction that
// was just latched into `exec`.
function automatic [127:0] fetch_operand;
    input [15:0] sel;
    reg [127:0] raw;
    reg [31:0] word;
    integer k;
    begin
        raw = regfile[sel[5:0]];
        for (k = 0; k < 4; k = k + 1) begin
            // {lane, 5'b0} is lane * 32, the bit offset of the selected lane.
            word = raw[{sel[6 + 2 * k +: 2], 5'b00000} +: 32];
            if (sel[14]) word = abs_sint32(word);
            if (sel[15]) word = -word;
            fetch_operand[32 * k +: 32] = word;
        end
    end
endfunction

// One lane of a two-operand ALU instruction on signed 16.16 values.
// Multiply and divide use a 64-bit intermediate so that no bits are lost
// before rescaling.  Division by zero yields X, as Verilog `/` does.
function automatic [31:0] alu_lane;
    input [7:0] op;
    input signed [31:0] a, b;
    reg signed [63:0] wide;
    begin
        case (op)
            OP_ADD: alu_lane = a + b;
            OP_SUB: alu_lane = a - b;
            OP_MUL: begin
                wide = a * b;              // 32.32 product
                alu_lane = wide[47:16];    // back to 16.16
            end
            OP_DIV: begin
                wide = (a <<< 16) / b;     // pre-scale the dividend to keep the fraction
                alu_lane = wide[31:0];
            end
            default: alu_lane = 32'd0;
        endcase
    end
endfunction

// Three-character mnemonic used in the trace line of an ALU instruction.
function [23:0] binop_name;
    input [7:0] op;
    begin
        case (op)
            OP_ADD:  binop_name = "add";
            OP_SUB:  binop_name = "sub";
            OP_MUL:  binop_name = "mul";
            default: binop_name = "div";
        endcase
    end
endfunction

// Edge function (ax - px) * (by - py) - (bx - px) * (ay - py).
// Vertices are sign-extended and the pixel position zero-extended before the
// subtraction, and the result is wide enough to hold the full product.
function automatic signed [47:0] edge_fn;
    input signed [15:0] ax, ay, bx, by;
    input [15:0] px, py;
    reg signed [17:0] dax, day, dbx, dby;
    begin
        dax = ax - $signed({1'b0, px});
        day = ay - $signed({1'b0, py});
        dbx = bx - $signed({1'b0, px});
        dby = by - $signed({1'b0, py});
        edge_fn = dax * dby - dbx * day;
    end
endfunction

reg signed [47:0] g1, g2, g3;    // edge function values latched on posedge init
reg covered;                     // scratch: all three edge functions non-negative

// Coverage test: latch the edge functions when init rises, then update the
// mask when init falls.
always @(posedge init) begin
    g1 = edge_fn(x3, y3, x2, y2, xpos, ypos);
    g2 = edge_fn(x1, y1, x3, y3, xpos, ypos);
    g3 = edge_fn(x2, y2, x1, y1, xpos, ypos);
    @(negedge init);
    covered = (g1 >= 0) && (g2 >= 0) && (g3 >= 0);
    mask <= covered;
    if (covered) begin
        condi <= 1'b0;
        $display("Core %d %d Begin", xpos, ypos);
    end
end

// Swizzle character for one 2-bit lane selector.
function automatic [7:0] index_char;
    input [1:0] index;
    begin
        case(index)
        2'b00: index_char = "x";
        2'b01: index_char = "y";
        2'b10: index_char = "z";
        2'b11: index_char = "w";
        endcase
    end
endfunction

// Four-character swizzle string for an 8-bit order field.
// index[7:6] becomes the first printed character, index[1:0] the last.
function automatic [31:0] index_str;
    input [7:0] index;
    begin
        index_str = {
            index_char(index[7:6]),
            index_char(index[5:4]),
            index_char(index[3:2]),
            index_char(index[1:0])
        };
    end
endfunction

// Instruction execution: one instruction per rising clock edge while enabled.
always @(posedge clk or negedge rst) begin
    if (!rst) mask <= 1'b0;
    else if (mask) begin
        exec = ins;
        case (exec[7:0])
            OP_NOP: $display("Core %d %d nop", xpos, ypos);
            OP_ADD, OP_SUB, OP_MUL, OP_DIV: begin
                $display("Core %d %d %s r%02d,r%02d.%s,r%02d.%s", xpos, ypos, binop_name(exec[7:0]), exec[13:8], exec[21:16], index_str(exec[29:22]), exec[37:32], index_str(exec[45:38]));
                src1 = fetch_operand(exec[31:16]);
                src2 = fetch_operand(exec[47:32]);
                for (lane = 0; lane < 4; lane = lane + 1) begin
                    res_ori[lane] = alu_lane(exec[7:0], src1[lane], src2[lane]);
                    // Saturation is applied to each lane separately.
                    res[lane] = exec[14] ? saturate_sint32(res_ori[lane]) : res_ori[lane];
                end
                regfile[exec[13:8]] <= res;
            end
            OP_DEF: begin
                // The immediate is a signed 16.16 value; print it as such.
                $display("Core %d %d def r%02d.%c,%f", xpos, ypos, exec[13:8], index_char(exec[15:14]), $signed(exec[47:16])/65536.0);
                regfile[exec[13:8]][exec[15:14]] <= exec[47:16];
            end
            default: $display("Core %d %d #UD %x", xpos, ypos, exec[7:0]);
        endcase
    end
end

endmodule

// calc_block: an X-by-Y tile of calc_core instances that share one instruction
// stream.
//
// Parameters
//   X, Y      number of cores per tile in each direction (default 8 x 8).
//             Each core position is formed as {xst[12:0], i[2:0]} /
//             {yst[12:0], j[2:0]}, i.e. only three bits of the core index are
//             used, so values above 8 would alias positions.
// Ports
//   rst       active-low asynchronous reset; clears busy and init
//   clk       clock
//   xst, yst  tile coordinates; become the upper 13 bits of each core position
//   x1..y3    triangle vertices, forwarded unchanged to every core
//   busy      1 from the end of start() until pc has reached pcend
//
// Simulation model only: start() is a task with timing controls that writes
// the same registers as the always block below.
module calc_block #(
    parameter X = 8,
    parameter Y = 8
)
(
    input wire rst,
    input wire clk,
    input wire [15:0] xst, yst,
    input wire [15:0] x1, x2, x3, y1, y2, y3,
    output reg busy
);

reg [63:0] prog[255:0];   // instruction memory
reg [7:0] pc, pcend;      // current instruction index / index of the last instruction
reg init;                 // pulsed by start(); drives every core's init input

wire [63:0] ins;
// One enable flag per core; sized from the parameters so the array always
// matches the generated instances.
wire mask[X-1:0][Y-1:0];

// All cores see the instruction addressed by pc.
assign ins = prog[pc];

genvar i, j;

generate
    for (i = 0; i < X; i = i + 1) begin
        for (j = 0; j < Y; j = j + 1) begin
            calc_core core_inst(
                .rst(rst),
                .clk(clk),
                .ins(ins),
                .xpos({xst[12:0], i[2:0]}), .ypos({yst[12:0], j[2:0]}),
                .x1(x1), .x2(x2), .x3(x3),
                .y1(y1), .y2(y2), .y3(y3),
                .init(init),
                .mask(mask[i][j])
            );
        end
    end
endgenerate

// Load the built-in four-instruction program, pulse init until the next
// falling clock edge, then raise busy.  All state is written with nonblocking
// assignments, matching the always block that shares these registers.
task start;
    begin
        pc <= 8'd0;
        pcend <= 8'd3;
        prog[0] <= 64'h000006C106C00101;   // add r1, r0.xyzw, r1.xyzw
        prog[1] <= 64'h000006C106C00002;   // sub r0, r0.xyzw, r1.xyzw
        prog[2] <= 64'h000006C106C00103;   // mul r1, r0.xyzw, r1.xyzw
        prog[3] <= 64'h0000000000000000;   // nop
        init <= 1'b1;
        @(negedge clk);
        init <= 1'b0;
        busy <= 1'b1;
    end
endtask

// Program sequencing.  While busy, pc advances on the falling clock edge that
// follows each rising edge, so `ins` is stable when the cores sample it on the
// next rising edge.  busy drops on the rising edge at which pc equals pcend.
always @(posedge clk or negedge rst) begin
    if (!rst) begin
        busy <= 1'b0;
        init <= 1'b0;
    end
    else if (busy) begin
        if (pc == pcend) begin
            busy <= 1'b0;
        end
        else begin
            @(negedge clk);
            pc <= pc + 1;
        end
    end
end

endmodule

// scheduler: walks the 8x8-pixel tiles of a triangle's bounding box and runs
// calc_block on every tile that is not entirely outside one triangle edge.
//
// Ports
//   rst, clk   forwarded to calc_block
//   x1..y3     signed triangle vertices in pixel coordinates
//
// fire_triangle is not called inside this module; presumably a testbench
// invokes it hierarchically (confirm against the caller).
module scheduler(
    input wire rst,
    input wire clk,
    input wire signed [15:0] x1, x2, x3, y1, y2, y3
);

// Largest pixel coordinate used when clamping the bounding box.
localparam signed [15:0] SCREEN_X_MAX = 16'sd127;
localparam signed [15:0] SCREEN_Y_MAX = 16'sd63;

reg [15:0] xst, yst;   // tile coordinates handed to calc_block
wire busy;             // declared explicitly instead of relying on an implicit net

calc_block block_inst(
    .rst(rst),
    .clk(clk),
    .busy(busy),
    .x1(x1), .x2(x2), .x3(x3),
    .y1(y1), .y2(y2), .y3(y3),
    .xst(xst),
    .yst(yst)
);

// Signed minimum of two 16-bit values.
function automatic [15:0] min_sint16;
    input signed [15:0] a, b;
    begin
        min_sint16 = (a < b ? a : b);
    end
endfunction

// Signed minimum of three 16-bit values.
function automatic [15:0] min3_sint16;
    input signed [15:0] a, b, c;
    begin
        min3_sint16 = min_sint16(a, min_sint16(b, c));
    end
endfunction

// Signed maximum of two 16-bit values.
function automatic [15:0] max_sint16;
    input signed [15:0] a, b;
    begin
        max_sint16 = (a < b ? b : a);
    end
endfunction

// Signed maximum of three 16-bit values.
function automatic [15:0] max3_sint16;
    input signed [15:0] a, b, c;
    begin
        max3_sint16 = max_sint16(a, max_sint16(b, c));
    end
endfunction

// Value of the edge function for edge (ax,ay)-(bx,by) at pixel (px,py),
// evaluated in 32-bit signed arithmetic.
function automatic signed [31:0] edge_value;
    input signed [31:0] px, py;
    input signed [15:0] ax, bx, ay, by;
    begin
        edge_value = (ax - bx) * py - (ay - by) * px + bx * ay - ax * by;
    end
endfunction

// 1 when all four corners of the 8x8 pixel tile whose top-left pixel is
// (x, y) give a negative edge function for edge (x1,y1)-(x2,y2).
function automatic block_out_line;
    input signed [15:0] x, y;
    input signed [15:0] x1, x2, y1, y2;
    begin
        block_out_line = (
            (edge_value(x,     y,     x1, x2, y1, y2) < 0) &&
            (edge_value(x,     y + 7, x1, x2, y1, y2) < 0) &&
            (edge_value(x + 7, y,     x1, x2, y1, y2) < 0) &&
            (edge_value(x + 7, y + 7, x1, x2, y1, y2) < 0)
        );
    end
endfunction

// 1 when the tile at pixel (x, y) lies entirely outside at least one of the
// three triangle edges.  Reads the module's vertex ports directly.
function automatic block_out_tri;
    input signed [15:0] x, y;
    begin
        block_out_tri = (
            block_out_line(x, y, x1, x2, y1, y2) ||
            block_out_line(x, y, x2, x3, y2, y3) ||
            block_out_line(x, y, x3, x1, y3, y1)
        );
    end
endfunction

// Loop state, in units of tiles (pixel coordinate >> 3).
//   xi, yi    current tile
//   xed, yed  exclusive upper bounds
//   ymem      first tile row, restored at the start of each column
reg [15:0] xi, yi, xed, yed, ymem;

// Rasterize the current triangle tile by tile.  Blocks until every tile that
// passes the culling test has been run through block_inst.
task fire_triangle;
    begin
        // Bounding box clamped to the screen, then converted to tile indices.
        // The upper bounds are also clamped at 0: the helper results are
        // unsigned, so a negative maximum would otherwise turn into a huge
        // bound after the logical shift.
        xi = (max_sint16(min3_sint16(x1, x2, x3), 16'd0) >> 3);
        yi = (max_sint16(min3_sint16(y1, y2, y3), 16'd0) >> 3);
        xed = (max_sint16(min_sint16(max3_sint16(x1, x2, x3), SCREEN_X_MAX), 16'd0) >> 3) + 1;
        yed = (max_sint16(min_sint16(max3_sint16(y1, y2, y3), SCREEN_Y_MAX), 16'd0) >> 3) + 1;
        $display("Triangle Rect (%d,%d)(%d,%d)", xi, yi, xed, yed);
        // Blocking: ymem may be read back below without any simulation time passing.
        ymem = yi;
        while (xi < xed) begin
            while (yi < yed) begin
                // The culling test works in pixel coordinates, so pass the
                // tile's top-left pixel (tile index * 8), the same packing
                // calc_block uses for core positions.
                if (!block_out_tri({xi[12:0], 3'b000}, {yi[12:0], 3'b000})) begin
                    $display("Start Rect (%d,%d)", xi, yi);
                    xst <= xi;
                    yst <= yi;
                    @(posedge clk);
                    block_inst.start();
                    @(negedge busy);
                end
                yi = yi + 1;
            end
            yi = ymem;
            xi = xi + 1;
        end
    end
endtask

endmodule